library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
# Global knitr chunk defaults: echo the source, cache chunk results, and tidy
# the echoed code using a width cutoff of 60 characters.
opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  tidy = TRUE,
  tidy.opts = list(width.cutoff = 60)
)
# Inspect the per-player, per-game box score table: dimensions, then structure.
dim(games_detail_df)
## [1] 645953 29
str(games_detail_df)
## 'data.frame': 645953 obs. of 29 variables:
## $ GAME_ID : int 22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 22101005 ...
## $ TEAM_ID : int 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 1610612750 ...
## $ TEAM_ABBREVIATION: chr "MIN" "MIN" "MIN" "MIN" ...
## $ TEAM_CITY : chr "Minnesota" "Minnesota" "Minnesota" "Minnesota" ...
## $ PLAYER_ID : int 1630162 1630183 1626157 1627736 1626156 1629675 1629162 1629669 1627752 1629006 ...
## $ PLAYER_NAME : chr "Anthony Edwards" "Jaden McDaniels" "Karl-Anthony Towns" "Malik Beasley" ...
## $ NICKNAME : chr "Anthony" "Jaden" "Karl-Anthony" "Malik" ...
## $ START_POSITION : chr "F" "F" "C" "G" ...
## $ COMMENT : chr "" "" "" "" ...
## $ MIN : chr "36:22" "23:54" "25:17" "30:52" ...
## $ FGM : num 4 6 4 4 3 3 2 6 3 0 ...
## $ FGA : num 10 8 9 9 13 8 5 13 8 0 ...
## $ FG_PCT : num 0.4 0.75 0.444 0.444 0.231 0.375 0.4 0.462 0.375 0 ...
## $ FG3M : num 3 1 1 4 1 1 0 2 2 0 ...
## $ FG3A : num 8 3 3 9 6 2 1 5 5 0 ...
## $ FG3_PCT : num 0.375 0.333 0.333 0.444 0.167 0.5 0 0.4 0.4 0 ...
## $ FTM : num 4 1 6 0 7 4 1 2 3 0 ...
## $ FTA : num 4 1 8 0 7 4 1 2 5 0 ...
## $ FT_PCT : num 1 1 0.75 0 1 1 1 1 0.6 0 ...
## $ OREB : num 0 2 1 0 0 3 0 0 0 0 ...
## $ DREB : num 8 4 9 3 6 7 1 0 2 0 ...
## $ REB : num 8 6 10 3 6 10 1 0 2 0 ...
## $ AST : num 5 0 0 1 9 1 3 1 1 0 ...
## $ STL : num 3 0 0 1 1 3 3 0 1 0 ...
## $ BLK : num 1 2 0 0 0 2 0 0 0 0 ...
## $ TO : num 1 2 3 1 5 1 0 0 1 0 ...
## $ PF : num 1 6 4 4 0 1 1 0 2 0 ...
## $ PTS : num 15 14 15 12 14 11 5 16 11 0 ...
## $ PLUS_MINUS : num 5 10 14 20 17 -7 -10 -5 1 0 ...
# Inspect the game-level results table (one row per game, with home and away
# team statistics side by side): dimensions, then structure.
dim(games_df)
## [1] 25796 21
str(games_df)
## 'data.frame': 25796 obs. of 21 variables:
## $ GAME_DATE_EST : chr "2022-03-12" "2022-03-12" "2022-03-12" "2022-03-12" ...
## $ GAME_ID : int 22101005 22101006 22101007 22101008 22101009 22101010 22101011 22100995 22100996 22100997 ...
## $ GAME_STATUS_TEXT: chr "Final" "Final" "Final" "Final" ...
## $ HOME_TEAM_ID : int 1610612748 1610612741 1610612759 1610612744 1610612743 1610612762 1610612757 1610612753 1610612737 1610612738 ...
## $ VISITOR_TEAM_ID : int 1610612750 1610612739 1610612754 1610612749 1610612761 1610612758 1610612764 1610612750 1610612746 1610612765 ...
## $ SEASON : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
## $ TEAM_ID_home : int 1610612748 1610612741 1610612759 1610612744 1610612743 1610612762 1610612757 1610612753 1610612737 1610612738 ...
## $ PTS_home : num 104 101 108 122 115 134 127 118 112 114 ...
## $ FG_PCT_home : num 0.398 0.443 0.412 0.484 0.551 0.558 0.516 0.465 0.478 0.467 ...
## $ FT_PCT_home : num 0.76 0.933 0.813 0.933 0.75 0.71 0.909 0.88 0.895 0.8 ...
## $ FG3_PCT_home : num 0.333 0.429 0.324 0.4 0.407 0.39 0.367 0.4 0.29 0.188 ...
## $ AST_home : num 23 20 28 33 32 21 21 31 28 23 ...
## $ REB_home : num 53 46 52 55 39 44 43 49 47 47 ...
## $ TEAM_ID_away : int 1610612750 1610612739 1610612754 1610612749 1610612761 1610612758 1610612764 1610612750 1610612746 1610612765 ...
## $ PTS_away : num 113 91 119 109 127 125 118 110 106 103 ...
## $ FG_PCT_away : num 0.422 0.419 0.489 0.413 0.471 0.5 0.47 0.456 0.488 0.422 ...
## $ FT_PCT_away : num 0.875 0.824 1 0.696 0.76 0.857 0.963 1 0.824 0.958 ...
## $ FG3_PCT_away : num 0.357 0.208 0.389 0.386 0.387 0.394 0.412 0.333 0.375 0.294 ...
## $ AST_away : num 21 19 23 27 28 27 26 24 22 21 ...
## $ REB_away : num 46 40 47 39 50 33 35 37 36 42 ...
## $ HOME_TEAM_WINS : int 0 1 0 1 0 1 1 1 1 1 ...
# Count the distinct games of the 2019 season in which the team abbreviated
# "GSW" appears as either the home or the away side. Wrapped in local() so the
# helper variables do not leak into the session.
local({
  gsw_id <- teams_df$TEAM_ID[which(teams_df$ABBREVIATION == "GSW")]
  gsw_involved <- games_df$TEAM_ID_home == gsw_id |
    games_df$TEAM_ID_away == gsw_id
  in_season_2019 <- games_df$SEASON == 2019
  length(unique(games_df$GAME_ID[gsw_involved & in_season_2019]))
})
## [1] 70
According to the data set, the Golden State Warriors played 70 games in the 2019-2020 season, whereas in fact they played only 65 regular-season games. The data set therefore appears to record every game, including preseason and playoff games.
# Derive calendar features from the game date for the seasonality analysis.
temp <- games_df
# Store the parsed date as POSIXct rather than POSIXlt: POSIXlt is a list
# under the hood and is awkward as a data frame column. The format is passed
# by name so it cannot be mistaken for another positional argument.
temp$GAME_DATE_EST <- as.POSIXct(
  games_df$GAME_DATE_EST,
  format = "%Y-%m-%d",
  tz = "EST"
)
temp$Month_EST <- month(temp$GAME_DATE_EST)
temp$DAY_EST <- day(temp$GAME_DATE_EST)
temp$YEAR_EST <- year(temp$GAME_DATE_EST)
# NOTE: lubridate's day() and mday() both return the day of the month, so
# MONTH_DAY duplicates DAY_EST; both are kept so existing references still work.
temp$MONTH_DAY <- mday(temp$GAME_DATE_EST)
temp$DAYOFYEAR <- yday(temp$GAME_DATE_EST)
# Number of games whose day of year lies strictly between 72 and 274
# (roughly mid-March through the end of September).
sum(temp$DAYOFYEAR > 72 & temp$DAYOFYEAR < 274)
## [1] 6334
# Sanity check on the derived day-of-year values (smallest and largest).
range(temp$DAYOFYEAR)
## [1] 1 366
Distribution of games across the days of the year (the 2021-2022 season is excluded because it is incomplete):
# Histograms of games by day of year. The incomplete 2021 season is dropped
# once, up front, so that the static (ggplot2) and interactive (plotly)
# figures are drawn from exactly the same observations.
complete_seasons <- data.frame(
  DAYOFYEAR = temp$DAYOFYEAR[temp$SEASON != 2021]
)

# Static histogram with one bin per day of the year.
ggplot(complete_seasons, aes(x = DAYOFYEAR)) + geom_histogram(binwidth = 1)

# Interactive histogram; nbinsx = 366 requests up to one bin per day.
fig1 <- plot_ly(
  data = complete_seasons,
  x = ~DAYOFYEAR,
  type = "histogram",
  nbinsx = 366
)
fig1